utils::View(incdataconinv)


rm(list=ls())
library(foreign)
library(xtable)
library(apsrtable)
library(arm)
library(ggplot2)
library(Amelia)
library(reshape2)
library(plyr)
num = function(x){ as.numeric(char(x)) }
char = function(x){ as.character(x) }


incdata = read.dta("C:/Users/Devin/Documents/R/MLE/incadv.dta", convert.factors=FALSE)
incdatacon <- subset(incdata, statelevel==0)
incdatastate <- subset(incdata, statelevel==1)

####IMPUTATION####


incdataconsmall <- incdatacon[,c("state", "year","dist", "treat", "rv", "rv_treat", "dv_money", "dv_investor1", "dv_investor0")]

a.out <- amelia(x=incdataconsmall, m=5, cs= "state", ts="year", idvar="dist")




b.out=NULL
se.out<-NULL
for(i in 1:5) {
  ols.out <- lm(dv_money ~ rv + rv_treat + treat, data = subset(a.out$imputations[[i]], rv < 5 & rv > -5))
  b.out <- rbind(b.out, ols.out$coef)
  se.out <- rbind(se.out, coef(summary(ols.out))[,2])
}
combined.results <- mi.meld(q = b.out, se = se.out)

b.out=NULL
se.out<-NULL
for(i in 1:5) {
  ols.out <- lm(dv_investor ~ treat + investor + rv + rv_treat + investor*treat, 
                data = subset(reshape(a.out$imputations[[i]], varying =c("dv_investor1", "dv_investor0"),
              timevar = "investor", idvar= c("state", "dist", "year"), direction="long", sep=""), rv < 5 & rv > -5))
  b.out <- rbind(b.out, ols.out$coef)
  se.out <- rbind(se.out, coef(summary(ols.out))[,2])
}
combined.resultsinv <- mi.meld(q = b.out, se = se.out)
missmap(a.out, main="", y.cex=.5)
?missmap


incdataconimp <- subset(reshape(a.out$imputations[[1]], varying =c("dv_investor1", "dv_investor0"),
                                timevar = "investor", 
                                idvar= c("state", "dist", "year"), direction="long", sep=""), rv < 5 & rv > -5)

incdataconimp2 <- subset(reshape(a.out$imputations[[2]], varying =c("dv_investor1", "dv_investor0"),
                                 timevar = "investor", 
                                 idvar= c("state", "dist", "year"), direction="long", sep=""), rv < 5 & rv > -5)

incdataconimp3 <- subset(reshape(a.out$imputations[[3]], varying =c("dv_investor1", "dv_investor0"),
                                 timevar = "investor", 
                                 idvar= c("state", "dist", "year"), direction="long", sep=""), rv < 5 & rv > -5)

incdataconimp4 <- subset(reshape(a.out$imputations[[4]], varying =c("dv_investor1", "dv_investor0"),
                                 timevar = "investor", 
                                 idvar= c("state", "dist", "year"), direction="long", sep=""), rv < 5 & rv > -5)

incdataconimp5 <- subset(reshape(a.out$imputations[[5]], varying =c("dv_investor1", "dv_investor0"),
                                 timevar = "investor", 
                                 idvar= c("state", "dist", "year"), direction="long", sep=""), rv < 5 & rv > -5)

vcov(incdataconimp1)


#mi.melt of each model object
#check variance covariance matrix
#miss map
######REPLICATION###########

incdatacon2 <- subset(incdatacon, !is.na(incdatacon$dv_money))
incdatacon3 <- subset(incdatacon2, !is.na(incdatacon2$rv))
incdataconrd <- subset(incdatacon3, rv < 5 & rv >= -5)
incdataconinv <- reshape(incdataconrd, varying =c("dv_investor1", "dv_investor0"), timevar = "investor", 
                         idvar= c("state", "dist", "year"), direction="long", sep="")




mod1=lm(dv_money ~ rv + rv_treat + treat, data=incdataconrd)
summary(mod1)

mod1imp = lm(dv_money ~ rv + rv_treat + treat, data=incdataconimp)
summary(mod1imp)


modinv = lm(dv_investor ~ treat + investor + rv + rv_treat + investor*treat, data=incdataconinv)
summary(modinv)

modinvimp = lm(dv_investor ~ treat + investor + rv + rv_treat + investor*treat, data=incdataconimp)
modinvimp2 = lm(dv_investor ~ treat + investor + rv + rv_treat + investor*treat, data=incdataconimp2)
modinvimp3 = lm(dv_investor ~ treat + investor + rv + rv_treat + investor*treat, data=incdataconimp3)
modinvimp4 = lm(dv_investor ~ treat + investor + rv + rv_treat + investor*treat, data=incdataconimp4)
modinvimp5 = lm(dv_investor ~ treat + investor + rv + rv_treat + investor*treat, data=incdataconimp5)

vcov(modinvimp1)
vcov(modinvimp2)
vcov(modinvimp3)
vcov(modinvimp4)
vcov(modinvimp5)

summary(modinvimp)
####CROSS VALIDATION TREAT####
set.seed=6886
k=100
incdataconimp$rand = sample(1:k, nrow(incdataconimp),  replace=TRUE)

rmse = function(pred, actual){sqrt(mean((pred-actual)^2))}


incdataconimp$investor_treat = incdataconimp$investor*incdataconimp$treat
dv='dv_money'
ivs=c('rv', 'rv_treat', 'treat')
modForm=formula(paste0( dv, ' ~ ', paste(ivs, collapse=' + ') ))

coefCrossVal=NULL
perf=NULL

for(ii in 1:k){print;
               train=incdataconimp[incdataconimp$rand!=ii,]
               test=incdataconimp[incdataconimp$rand==ii,]
               
               trainRes=summary(lm(dv_money ~ rv + rv_treat + treat, data=train)) $'coefficients'[,1:2]
               trainRes=cbind(trainRes, ii)
               coefCrossVal=rbind(coefCrossVal, trainRes)
               
               preds = trainRes[,1] %*% t(data.matrix(cbind(1, test[,ivs])))
               perf = c(perf, rmse(preds, test$dv_money))
}

perfdf = as.data.frame(cbind(1,perf))
tmp <- ggplot(perfdf, aes(x=perf)) + geom_density(alpha=.3, fill="#009E73")+xlab("Root Mean Squared Error") + 
  ylab("Density") + ggtitle("Model A") + 
  theme(plot.title = element_text(size=40, lineheight=2, face="bold"), axis.text=element_text(size=16),
                                  axis.title=element_text(size=18)) 
tmp

#line plot for rmse/density plot
ggData = data.frame ( rownames(coefCrossVal), coefCrossVal, row.names=NULL)


ggData = ggData[-c(2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,34,35,38,39),]
colnames(ggData)=c('var', 'est', 'stderr', 'rand')

for(ii in 2:3){ ggData[,ii] = num(ggData[,ii]) }

ggData$hi95 = ggData$est + qnorm(.975)*ggData$stderr
ggData$lo95 = ggData$est - qnorm(.975)*ggData$stderr
ggData$hi90 = ggData$est + qnorm(.95)*ggData$stderr
ggData$lo90 = ggData$est - qnorm(.95)*ggData$stderr

# Lets make out plot now
tmp = ggplot(ggData, aes(x=factor(rand), y=est))
tmp = tmp + geom_point()
tmp = tmp + geom_linerange(aes(ymin=lo95, ymax=hi95), lwd=1)
tmp = tmp + geom_linerange(aes(ymin=lo90, ymax=hi90), lwd=2)
tmp = tmp + facet_wrap(~var, scales='free_y')
tmp = tmp + geom_hline(aes(y=0), color='red', linetype=2)


mf_labeller <- function(var, value){
  value <- as.character(value)
  if (var=="var") { 
    value[value=="(Intercept)"] <- "Intercept"
    value[value=="treat"]   <- "Democrat Win"
  }
  return(value)
}

tmp = tmp + facet_grid(. ~ var, labeller=mf_labeller)
tmp1 = tmp + ylab("Estimate") + xlab("Fold Number") + ggtitle("Model A") + 
  theme(plot.title = element_text(lineheight=.8, face="bold"))
tmp1


######CROSS VALIDATION INV######
set.seed=6886
k=100
incdataconimp$rand = sample(1:k, nrow(incdataconimp),  replace=TRUE)

rmse = function(pred, actual){sqrt(mean((pred-actual)^2))}


incdataconimp$investor_treat = incdataconimp$investor*incdataconimp$treat
dv='dv_investor'
ivs=c('treat', 'investor', 'rv', 'rv_treat', 'investor_treat')
modForm=formula(paste0( dv, ' ~ ', paste(ivs, collapse=' + ') ))

coefCrossVal=NULL
perf=NULL

for(ii in 1:k){print;
               train=incdataconimp[incdataconimp$rand!=ii,]
               test=incdataconimp[incdataconimp$rand==ii,]
               
               trainRes=summary(lm(dv_investor ~ treat + investor + rv + rv_treat + investor_treat, data=train)) $'coefficients'[,1:2]
               trainRes=cbind(trainRes, ii)
               coefCrossVal=rbind(coefCrossVal, trainRes)
               
               preds = trainRes[,1] %*% t(data.matrix(cbind(1, test[,ivs])))
               perf = c(perf, rmse(preds, test$dv_investor))
}

perfdf = as.data.frame(cbind(1,perf))
tmp <- ggplot(perfdf, aes(x=perf)) + geom_density(alpha=.3, fill="#009E73")+xlab("Root Mean Squared Error") + 
  ylab("Density") + ggtitle("Model B") + 
  theme(plot.title = element_text(size=40, lineheight=2, face="bold"), axis.text=element_text(size=16),
        axis.title=element_text(size=18)) 
tmp

#line plot for rmse/density plot
ggData = data.frame ( rownames(coefCrossVal), coefCrossVal, row.names=NULL)
rmse(preds, incdataconimp$dv_investor)

ggData = ggData[-c(4,5,10,11,16,17,22,23,28,29,34,35,40,41,46,47,52,53,58,59),]
colnames(ggData)=c('var', 'est', 'stderr', 'rand')

for(ii in 2:3){ ggData[,ii] = num(ggData[,ii]) }
ggData$est

ggData$hi95 = ggData$est + qnorm(.975)*ggData$stderr
ggData$lo95 = ggData$est - qnorm(.975)*ggData$stderr
ggData$hi90 = ggData$est + qnorm(.95)*ggData$stderr
ggData$lo90 = ggData$est - qnorm(.95)*ggData$stderr

# Lets make out plot now
tmp = ggplot(ggData, aes(x=factor(rand), y=est))
tmp = tmp + geom_point()
tmp = tmp + geom_linerange(aes(ymin=lo95, ymax=hi95), lwd=1)
tmp = tmp + geom_linerange(aes(ymin=lo90, ymax=hi90), lwd=2)
tmp = tmp + facet_wrap(~var, scales='free_y')
tmp = tmp + geom_hline(aes(y=0), color='red', linetype=2)


mf_labeller <- function(var, value){
  value <- as.character(value)
  if (var=="var") { 
    value[value=="(Intercept)"] <- "Intercept"
    value[value=="treat"]   <- "Democrat Win"
    value[value=="investor"]   <- "Investor"
    value[value=="investor_treat"]   <- "Investor*Democratic Win"
  }
  return(value)
}

tmp1 = tmp + ylab("Estimate") + xlab("Fold Number") + ggtitle("Model B") + 
  theme(plot.title = element_text(lineheight=.8, face="bold"))
tmp1

####PLOT####
require(ggplot2)
?geo
plot <- qplot(rv, dv_money, group = rv > 0, geom = c('point', 'smooth'), 
              method = 'lm', se = F, data = incdataconrd )
plot2 <- plot + geom_vline(aes(xintercept=0), colour="#990000", linetype="dashed", size=2) +  geom_point(shape=2,
alpha=1/20) + xlab("Democratic Party's Vote Share Winning Margin") + ylab("Democratic Party's Share of Contributions") +
  ggtitle("Effect of Incumbency on Democrats' Share\n of Contributions in the U.S. House")+
  geom_smooth(method=lm, size=2)
plot2
#####COEFPLOTS####

##first model


ggData = data.frame(variable.names(mod1), mod1$coefficients, coef(summary(mod1))[,2])
ggData = ggData[-c(2,3),]
colnames(ggData)=c('var', 'est', 'stderr')
ggData$var <- c("Intercept", "Democrat Win")
ggData$var <- factor(ggData$var, levels=unique(ggData$var))


ggData$hi95 = ggData$est + qnorm(.975)*ggData$stderr
ggData$lo95 = ggData$est - qnorm(.975)*ggData$stderr
ggData$hi90 = ggData$est + qnorm(.95)*ggData$stderr
ggData$lo90 = ggData$est - qnorm(.95)*ggData$stderr


tmp = ggplot(ggData, aes(x=var, y=est))
tmp = tmp + geom_point()
tmp = tmp + geom_linerange(aes(ymin=lo95, ymax=hi95), lwd=1)
tmp = tmp + geom_linerange(aes(ymin=lo90, ymax=hi90), lwd=2)


tmp = tmp + geom_hline(aes(y=0), color='red', linetype=2)
tmp = tmp + ylab("Estimate") + xlab("Variable") + ggtitle("Model A") + 
  theme(plot.title = element_text(size=40, lineheight=2, face="bold"), axis.text=element_text(size=16),
        axis.title=element_text(size=18)) 
tmp


##second model




ggData = data.frame(variable.names(modinv), modinv$coefficients, coef(summary(modinv))[,2])
ggData = ggData[-c(4,5),]
colnames(ggData)=c('var', 'est', 'stderr')
ggData$var <- c("Intercept", "Democrat Win", "Investor", "Investor*Democrat Win")
ggData$var <- factor(ggData$var, levels=unique(ggData$var))

ggData$hi95 = ggData$est + qnorm(.975)*ggData$stderr
ggData$lo95 = ggData$est - qnorm(.975)*ggData$stderr
ggData$hi90 = ggData$est + qnorm(.95)*ggData$stderr
ggData$lo90 = ggData$est - qnorm(.95)*ggData$stderr


tmp = ggplot(ggData, aes(x=var, y=est))
tmp = tmp + geom_point()
tmp = tmp + geom_linerange(aes(ymin=lo95, ymax=hi95), lwd=1)
tmp = tmp + geom_linerange(aes(ymin=lo90, ymax=hi90), lwd=2)



tmp = tmp + geom_hline(aes(y=0), color='red', linetype=2)
tmp = tmp + ylab("Estimate") + xlab("Variable") + ggtitle("Model B") + 
  theme(plot.title = element_text(size=40, lineheight=2, face="bold"), axis.text=element_text(size=16),
        axis.title=element_text(size=18)) 
tmp

#first model imputed
ggData = data.frame(colnames(combined.results$q.mi), combined.results$q.mi[1,], combined.results$se.mi[1,])
ggData = ggData[-c(2,3),]

colnames(ggData)=c('var', 'est', 'stderr')
ggData$var <- c("Intercept", "Democrat Win")
ggData$var <- factor(ggData$var, levels=unique(ggData$var))

ggData$hi95 = ggData$est + qnorm(.975)*ggData$stderr
ggData$lo95 = ggData$est - qnorm(.975)*ggData$stderr
ggData$hi90 = ggData$est + qnorm(.95)*ggData$stderr
ggData$lo90 = ggData$est - qnorm(.95)*ggData$stderr


tmp = ggplot(ggData, aes(x=var, y=est))
tmp = tmp + geom_point()
tmp = tmp + geom_linerange(aes(ymin=lo95, ymax=hi95), lwd=1)
tmp = tmp + geom_linerange(aes(ymin=lo90, ymax=hi90), lwd=2)

tmp = tmp + geom_hline(aes(y=0), color='red', linetype=2)
tmp = tmp + ylab("Estimate") + xlab("Variable") + ggtitle("Model A") + 
  theme(plot.title = element_text(size=40, lineheight=2, face="bold"), axis.text=element_text(size=16),
        axis.title=element_text(size=18)) 
tmp


#second model imputed
ggData = data.frame(colnames(combined.resultsinv$q.mi), combined.resultsinv$q.mi[1,], combined.resultsinv$se.mi[1,])
ggData = ggData[-c(4,5),]
colnames(ggData)=c('var', 'est', 'stderr')
ggData$var <- c("Intercept", "Democrat Win", "Investor", "Investor*Democrat Win")
ggData$var <- factor(ggData$var, levels=unique(ggData$var))

ggData$hi95 = ggData$est + qnorm(.975)*ggData$stderr
ggData$lo95 = ggData$est - qnorm(.975)*ggData$stderr
ggData$hi90 = ggData$est + qnorm(.95)*ggData$stderr
ggData$lo90 = ggData$est - qnorm(.95)*ggData$stderr

tmp = ggplot(ggData, aes(x=var, y=est))
tmp = tmp + geom_point()
tmp = tmp + geom_linerange(aes(ymin=lo95, ymax=hi95), lwd=1)
tmp = tmp + geom_linerange(aes(ymin=lo90, ymax=hi90), lwd=2)

tmp3 = tmp + geom_hline(aes(y=0), color='red', linetype=2) + ylab("Estimate") + xlab("Variable")+ ggtitle("Model B") + 
  theme(plot.title = element_text(size=40, lineheight=2, face="bold"), axis.text=element_text(size=16),
        axis.title=element_text(size=18)) 
tmp3


####Simulation treat


treatRange = c(0,1)
scen = with(data=incdataconimp, cbind(1, mean(rv), mean(rv_treat), treatRange) )

# Calculate predicted values
library(MASS)
sims = 1000
draws = mvrnorm(sims, coef(mod1imp), vcov(mod1imp))
predVals = draws %*% t(scen)

plot(density(predVals[,1]), col='red', xlim=c(32,70))
lines(density(predVals[,2]), col='blue')

rmse = sqrt(mean( resid(mod1)^2 )  )
predSto = apply(predVals, 2, function(x) rnorm(sims, x, rmse))


modelExp = predSto
head(modelExp)
colnames(modelExp)=1:ncol(modelExp)

modelExp2=melt(modelExp)[,-1]
ggMeans = ddply(modelExp2, .(Var2), summarise, sMean=mean(value))
ggDensity = ddply(modelExp2, .(Var2), .fun=function(x){
  tmp = density(x$value); x1 = tmp$x; y1 = tmp$y
  q95 = x1 >= quantile(x$value,0.025) & x1 <= quantile(x$value,0.975)
  q90 = x1 >= quantile(x$value,0.05) & x1 <= quantile(x$value,0.95)
  data.frame(x=x1,y=y1,q95=q95, q90=q90) } )

ggMeans$Var2 = as.factor(ggMeans$Var2)
ggDensity$Var2 = as.factor(ggDensity$Var2)
temp = ggplot()
temp = temp + geom_line(data=ggDensity, aes(x=x,y=y,color=Var2))

temp = temp + geom_ribbon(data=subset(ggDensity,q95),
                          aes(x=x,ymax=y,fill=Var2),ymin=0,alpha=0.3)
temp = temp + geom_ribbon(data=subset(ggDensity,q90),
                          aes(x=x,ymax=y,fill=Var2),ymin=0,alpha=.8)
temp = temp + geom_vline(data=ggMeans,
                         aes(xintercept=sMean),linetype='dotted',size=1, color="#000000")
temp = temp + theme(legend.position='none') + ylab("Density") + 
  xlab("Democratic Share of Contributions") + ggtitle("Model A") +
  theme(plot.title = element_text(lineheight=.8, face="bold"))

temp




####Simulation inv

treatinvRange = c(0,1)
scen = with(data=incdataconimp, cbind(1, mean(treat), mean(investor), mean(rv), mean(rv_treat), treatinvRange) )

# Calculate predicted values
library(MASS)
sims = 1000
draws = mvrnorm(sims, coef(modinvimp), vcov(modinvimp))
predVals = draws %*% t(scen)

plot(density(predVals[,1]), col='red', xlim=c(32,70))
lines(density(predVals[,2]), col='blue')

# Stochastic uncertainty
rmse = sqrt(mean( resid(modinvimp)^2 )  )
predSto = apply(predVals, 2, function(x) rnorm(sims, x, rmse))


modelExp = predSto
colnames(modelExp)=1:ncol(modelExp)

modelExp2=melt(modelExp)[,-1]
ggMeans = ddply(modelExp2, .(Var2), summarise, sMean=mean(value))
ggDensity = ddply(modelExp2, .(Var2), .fun=function(x){
  tmp = density(x$value); x1 = tmp$x; y1 = tmp$y
  q95 = x1 >= quantile(x$value,0.025) & x1 <= quantile(x$value,0.975)
  q90 = x1 >= quantile(x$value,0.05) & x1 <= quantile(x$value,0.95)
  data.frame(x=x1,y=y1,q95=q95, q90=q90) } )

ggMeans$Var2 = as.factor(ggMeans$Var2)
ggDensity$Var2 = as.factor(ggDensity$Var2)
temp = ggplot()
temp = temp + geom_line(data=ggDensity, aes(x=x,y=y,color=Var2))

temp = temp + geom_ribbon(data=subset(ggDensity,q95),
                          aes(x=x,ymax=y,fill=Var2),ymin=0,alpha=0.3)
temp = temp + geom_ribbon(data=subset(ggDensity,q90),
                          aes(x=x,ymax=y,fill=Var2),ymin=0,alpha=.8)
temp = temp + geom_vline(data=ggMeans,
                         aes(xintercept=sMean),linetype='dotted',size=1, color="#000000")
temp = temp + theme(legend.position='none')+ ylab("Density") + 
  xlab("Democratic Share of Contributions") + ggtitle("Model B") + 
  theme(plot.title = element_text(lineheight=.8, face="bold"))
temp


